这里直接读取作者给定的第一个病人的Gene expression analysis: discovery patient PBMC,用的是 10x genomics 3’ Chromium expression assay
Following sequence alignment and filtering, a total of 12,874 cells were analyzed.
最后是 17,712 genes and 12,874 cells, 所以对计算机的考验很大,而且文章使用的是seurat2,所以这里我们选取并且安装seurat2
需要自行下载安装一些必要的R包! 而且需要注意版本 Seurat
因为大量学员在中国大陆,通常不建议大家使用下面的R包安装方法,建议是切换镜像后再下载R包。
参考:http://www.bio-info-trainee.com/3727.html
# 下面代码不运行。
# Enter commands in R (or R studio, if installed)
# Install the devtools package from Hadley Wickham
install.packages('devtools')
# Replace '2.3.0' with your desired version
devtools::install_version(package = 'Seurat', version = package_version('2.3.0'))
library(Seurat)
加载R包
rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(Seurat))
start_time <- Sys.time()
# 如果觉得这里较慢,可以使用 data.table 包的 fread函数。
raw_dataPBMC <- read.csv('../Output_2018-03-12/GSE117988_raw.expMatrix_PBMC.csv.gz', header = TRUE, row.names = 1)
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.172894 mins
# 通常电脑一分钟可以搞定。
dim(raw_dataPBMC)
## [1] 17712 12874
start_time <- Sys.time()
# 按照列,对每一个细胞进行内部归一化,主要是统一文库大小。
dataPBMC <- log2(1 + sweep(raw_dataPBMC, 2, median(colSums(raw_dataPBMC))/colSums(raw_dataPBMC), '*')) # Normalization
end_time <- Sys.time()
end_time - start_time
## Time difference of 30.52906 secs
# 下面是简单的字符串切割, ExtractField {Seurat}
head(colnames(dataPBMC))
## [1] "AAACCTGAGCGAAGGG.1" "AAACCTGAGGTCATCT.1" "AAACCTGAGTCCTCCT.1"
## [4] "AAACCTGCACCAGCAC.1" "AAACCTGGTAACGTTC.1" "AAACCTGGTAAGGATT.1"
timePoints <- sapply(colnames(dataPBMC), function(x) ExtractField(x, 2, '[.]'))
timePoints <-ifelse(timePoints == '1', 'PBMC_Pre',
ifelse(timePoints == '2', 'PBMC_EarlyD27',
ifelse(timePoints == '3', 'PBMC_RespD376', 'PBMC_ARD614')))
table(timePoints)
## timePoints
## PBMC_ARD614 PBMC_EarlyD27 PBMC_Pre PBMC_RespD376
## 4516 1592 2082 4684
# 可以看到是治疗前,加上治疗中的3个时间点,这4个分组信息。
简单看看表达矩阵的性质,主要是基因数量,细胞数量;
以及每个细胞表达基因的数量,和每个基因在多少个细胞里面表达。
dim(dataPBMC)
## [1] 17712 12874
# 可以看到,近2万的基因里面,
# 绝大部分基因只在一万多细胞的200个不到的表达
fivenum(apply(dataPBMC,1,function(x) sum(x>0) ))
## RP4-669L17.10 LAMB3 NAT10 AC093673.5 RPL21
## 1 6 50 207 12102
boxplot(apply(dataPBMC,1,function(x) sum(x>0) ))
# 可以看到,一万多细胞里面
# 绝大部分细胞只能检测不到500个基因。
fivenum(apply(dataPBMC,2,function(x) sum(x>0) ))
## CAAGAAATCGATCCCT.2 GGCCGATTCCAGAGGA.3 TCAACGAAGAGCTGGT.3
## 10 321 395
## TGCCAAAGTCTGCGGT.4 TCTGGAATCTATCGCC.3
## 481 2865
hist(apply(dataPBMC,2,function(x) sum(x>0) ))
start_time <- Sys.time()
# Create Seurat object
PBMC <- CreateSeuratObject(raw.data = dataPBMC,
min.cells = 1, min.genes = 0, project = '10x_PBMC') # already normalized
dim(dataPBMC)
## [1] 17712 12874
PBMC # 17,712 genes and 12,874 cells
## An object of class seurat in project 10x_PBMC
## 17712 genes across 12874 samples.
# 可以看到上面创建Seurat对象的那些参数并没有过滤基因或者细胞。
end_time <- Sys.time()
end_time - start_time
## Time difference of 8.503639 secs
# Add meta.data (nUMI and timePoints)
PBMC <- AddMetaData(object = PBMC,
metadata = apply(raw_dataPBMC, 2, sum),
col.name = 'nUMI_raw')
PBMC <- AddMetaData(object = PBMC, metadata = timePoints, col.name = 'TimePoints')
这里绘图,可以指定分组,前提是这个分组变量存在于meta信息里面,我们创建对象后使用函数添加了 TimePoints 属性,所以可以用来进行可视化。
这里是:‘TimePoints’
sce=PBMC
VlnPlot(object = sce,
features.plot = c("nGene", "nUMI"),
group.by = 'TimePoints', nCol = 2)
GenePlot(object = sce, gene1 = "nUMI", gene2 = "nGene")
可以看看高表达量基因是哪些
tail(sort(Matrix::rowSums(sce@raw.data)))
## EEF1A1 RPL13A RPLP1 TMSB4X B2M MALAT1
## 37203.12 38345.11 38977.99 43027.61 46854.95 62363.12
## 散点图可视化任意两个基因的一些属性(通常是细胞的度量)
# 这里选取两个基因。
tmp=names(sort(Matrix::rowSums(sce@raw.data),decreasing = T))
GenePlot(object = sce, gene1 = tmp[1], gene2 = tmp[2])
# 散点图可视化任意两个细胞的一些属性(通常是基因的度量)
# 这里选取两个细胞
CellPlot(sce,sce@cell.names[3],sce@cell.names[4],do.ident = FALSE)
很简单的流程,先ScaleData,再FindVariableGenes,然后根据找到的高变异基因进行RunPCA,再根据PCA结果进行FindClusters即可,最后再RunTSNE后进行可视化。
start_time <- Sys.time()
# 最耗费时间的步骤在这里。
PBMC <- ScaleData(object = PBMC, vars.to.regress = c('nUMI_raw'), model.use = 'linear', use.umi = FALSE)
## NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
## ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.
##
## Time Elapsed: 53.2460291385651 secs
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.309101 mins
start_time <- Sys.time()
PBMC <- FindVariableGenes(object = PBMC,
mean.function = ExpMean,
dispersion.function = LogVMR,
x.low.cutoff = 0.0125,
x.high.cutoff = 3,
y.cutoff = 0.5)
head(PBMC@var.genes)
## [1] "AP006222.2" "ISG15" "TNFRSF18" "TNFRSF4" "MMP23B"
## [6] "SMIM1"
length(PBMC@var.genes)
## [1] 1203
PBMC <- RunPCA(object = PBMC, pc.genes = PBMC@var.genes)
## [1] "PC1"
## [1] "S100A9" "CST3" "FTH1" "LYZ"
## [5] "S100A8" "SAT1" "AIF1" "GPX1"
## [9] "LST1" "FCN1" "CTSS" "CSTA"
## [13] "S100A12" "NEAT1" "LGALS2" "VCAN"
## [17] "RP11-1143G9.4" "TYMP" "FOS" "MNDA"
## [21] "PPBP" "SERPINA1" "AP1S2" "MS4A6A"
## [25] "MAFB" "CD14" "DUSP1" "NAMPT"
## [29] "ACTB" "MT-ND1"
## [1] ""
## [1] "NKG7" "IL32" "GNLY" "LTB" "GZMH" "KLRB1"
## [7] "CTSW" "ALAS2" "GZMB" "FGFBP2" "HBD" "AHSP"
## [13] "HLA-C" "CMC1" "CD247" "TRBC1" "KLRD1" "GZMM"
## [19] "PRF1" "IL2RG" "CD3E" "ACAP1" "LCK" "CD2"
## [25] "CA1" "TRDC" "SLC25A39" "ISG20" "BPGM" "EVL"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "PF4" "GNG11" "SDPR" "HIST1H2AC" "PPBP"
## [6] "TUBB1" "ACRBP" "RGS18" "CLU" "NRGN"
## [11] "SPARC" "GP9" "MAP3K7CL" "NGFRAP1" "TSC22D1"
## [16] "NCOA4" "TREML1" "MMD" "RUFY1" "PGRMC1"
## [21] "PTCRA" "CLEC1B" "HIST1H3H" "C6orf25" "CMTM5"
## [26] "ITGA2B" "TMEM40" "MYL9" "TUBA4A" "TAGLN2"
## [1] ""
## [1] "S100A9" "LYZ" "S100A8" "FCN1"
## [5] "CTSS" "AIF1" "LST1" "CSTA"
## [9] "VCAN" "S100A12" "RP11-1143G9.4" "LGALS2"
## [13] "NEAT1" "DUSP1" "FOS" "TYMP"
## [17] "SERPINA1" "MNDA" "CD14" "MAFB"
## [21] "MS4A6A" "NAMPT" "CST3" "SPI1"
## [25] "MCL1" "CLEC7A" "G0S2" "TKT"
## [29] "CFP" "LINC00936"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "NKG7" "HLA-C" "GZMB" "GNLY" "S100A4" "FGFBP2"
## [7] "GZMH" "CTSW" "ACTB" "PRF1" "KLRB1" "KLRD1"
## [13] "MT-ATP6" "CD247" "CMC1" "IL32" "FCGR3A" "MT-CYB"
## [19] "SPON2" "KLRF1" "TRDC" "GZMM" "RHOC" "CCL4"
## [25] "CLIC3" "LITAF" "MT-ND3" "C12orf75" "TRBC1" "BIN2"
## [1] ""
## [1] "ALAS2" "AHSP" "SNCA" "HBD" "SLC25A37" "CA1"
## [7] "HBM" "CD79A" "SELENBP1" "SLC25A39" "BLVRB" "BPGM"
## [13] "IGKC" "IGHD" "GMPR" "MPP1" "IGHM" "EIF1AY"
## [19] "TCL1A" "BNIP3L" "STRADB" "CD79B" "IGLC2" "MS4A1"
## [25] "IFIT1B" "FAM210B" "YBX3" "HLA-DQA1" "DCAF12" "HEMGN"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "ALAS2" "AHSP" "HBD" "SNCA" "SLC25A37" "CA1"
## [7] "HBM" "BLVRB" "SLC25A39" "BPGM" "SELENBP1" "BNIP3L"
## [13] "MKRN1" "FAM210B" "FECH" "MPP1" "IFIT1B" "DCAF12"
## [19] "GMPR" "BSG" "STRADB" "HEMGN" "LGALS3" "SLC4A1"
## [25] "ADIPOR1" "PITHD1" "KRT1" "BCL2L1" "EIF1AY" "NKG7"
## [1] ""
## [1] "CD74" "CD79A" "HLA-DPB1" "IGHD" "IGKC"
## [6] "CD79B" "IGHM" "TCL1A" "HLA-DRB1" "HLA-DQA1"
## [11] "MS4A1" "IGLC2" "MT-CYB" "MT-ND3" "MT-ATP6"
## [16] "LTB" "VPREB3" "HLA-DRB5" "MT-ND1" "BANK1"
## [21] "PLPP5" "FCER2" "LINC00926" "IGLC3" "HLA-DMA"
## [26] "MEF2C" "BIRC3" "HLA-DMB" "CD69" "GABPB1-AS1"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "FCGR3A" "GZMB" "HLA-DPB1" "FGFBP2" "HLA-DRB1" "CD74"
## [7] "SPON2" "PRF1" "IFITM3" "RHOC" "KLRF1" "NKG7"
## [13] "HLA-DQA1" "HBD" "CD79B" "AHSP" "CD79A" "ALAS2"
## [19] "HLA-DRB5" "FAM26F" "SNCA" "TCL1A" "IGFBP7" "CLIC3"
## [25] "CTSW" "CA1" "KLRD1" "IGHD" "HBM" "BPGM"
## [1] ""
## [1] "IL7R" "IL32" "CD3E" "GZMK" "LTB" "MT-CYB"
## [7] "CD8B" "CD3G" "MT-ND1" "CD2" "LEPROTL1" "MT-ND3"
## [13] "MT-ATP6" "DUSP2" "CD8A" "LYAR" "SPOCK2" "S100A12"
## [19] "S100A8" "GSTK1" "TNFAIP3" "TRBC1" "C12orf57" "SLC2A3"
## [25] "VCAN" "CD44" "LCK" "RORA" "FYB" "S100A9"
## [1] ""
## [1] ""
## 避免太多log日志被打印出来。
PBMC <- FindClusters(object = PBMC,
reduction.type = "pca",
dims.use = 1:10,
resolution = 1,
print.output = 0,
k.param = 35, save.SNN = TRUE) # 13 clusters
PBMC <- RunTSNE(object = PBMC, dims.use = 1:10)
# 配色这里直接使用文章配色方案。
TSNEPlot(PBMC, colors.use = c('green4', 'pink', '#FF7F00', 'orchid', '#99c9fb', 'dodgerblue2', 'grey30', 'yellow', 'grey60', 'grey', 'red', '#FB9A99', 'black'))
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.144884 mins
start_time <- Sys.time()
save(PBMC,file = 'patient1.PBMC.output.Rdata')
end_time <- Sys.time()
end_time - start_time
## Time difference of 1.414166 mins
# 这个步骤会输出文件 1.75G
最后,这 13 clusters要进行注释,才能发表,如下所示:
作者文章里面是Representative marker genes shown in Supplementary Fig. 7. 如下所示
可以看到作者对PBMC里面的细胞都挑选了一个基因就命名了。
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Seurat_2.3.4 Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.2.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-0 class_7.3-14
## [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.1
## [7] rprojroot_1.3-2 htmlTable_1.13.1 base64enc_0.1-3
## [10] rstudioapi_0.9.0 proxy_0.4-22 npsurv_0.4-0
## [13] flexmix_2.3-14 bit64_0.9-7 codetools_0.2-15
## [16] splines_3.5.1 R.methodsS3_1.7.1 lsei_1.2-0
## [19] robustbase_0.93-3 knitr_1.21 jsonlite_1.6
## [22] Formula_1.2-3 ica_1.0-2 cluster_2.0.7-1
## [25] kernlab_0.9-27 png_0.1-7 R.oo_1.22.0
## [28] compiler_3.5.1 httr_1.3.1 backports_1.1.3
## [31] assertthat_0.2.0 lazyeval_0.2.1 lars_1.2
## [34] acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [37] bindrcpp_0.2.2 igraph_1.2.2 gtable_0.2.0
## [40] glue_1.3.0 RANN_2.6 reshape2_1.4.3
## [43] dplyr_0.7.8 Rcpp_1.0.0 gdata_2.18.0
## [46] ape_5.2 nlme_3.1-137 iterators_1.0.10
## [49] fpc_2.2-3 gbRd_0.4-11 lmtest_0.9-36
## [52] xfun_0.4 stringr_1.3.1 irlba_2.3.2
## [55] gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-50
## [58] zoo_1.8-4 scales_1.0.0 doSNOW_1.0.16
## [61] parallel_3.5.1 RColorBrewer_1.1-2 yaml_2.2.0
## [64] reticulate_1.10 pbapply_1.3-4 gridExtra_2.3
## [67] rpart_4.1-13 segmented_0.5-3.0 latticeExtra_0.6-28
## [70] stringi_1.2.4 foreach_1.4.4 checkmate_1.9.1
## [73] caTools_1.17.1.1 bibtex_0.4.2 Rdpack_0.10-1
## [76] SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2
## [79] dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6
## [82] evaluate_0.12 lattice_0.20-35 ROCR_1.0-7
## [85] purrr_0.3.0 bindr_0.1.1 labeling_0.3
## [88] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [91] plyr_1.8.4 magrittr_1.5 R6_2.3.0
## [94] snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0
## [97] pillar_1.3.1 foreign_0.8-70 withr_2.1.2
## [100] fitdistrplus_1.0-11 mixtools_1.1.0 survival_2.42-3
## [103] nnet_7.3-12 tsne_0.1-3 tibble_2.0.1
## [106] crayon_1.3.4 hdf5r_1.0.1 KernSmooth_2.23-15
## [109] rmarkdown_1.10 grid_3.5.1 data.table_1.12.0
## [112] metap_1.0 digest_0.6.18 diptest_0.75-7
## [115] tidyr_0.8.1 R.utils_2.7.0 stats4_3.5.1
## [118] munsell_0.5.0